# Samples random passages for hand annotation: 120 from Poetry (60 each from 19C and 20C), 150 from Non-Fiction, and 150 from Science.

import os
import random
import re

import utils
# Root data directory: corpora are read from, and sampled passages written to, sub-directories of it.
# NOTE(review): hard-coded absolute path to one user's machine -- adjust before running elsewhere.
path = '/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/data/'

def _sample_poetry_century(century, authors_sampled, banner_prefix='', n_passages=60):
    """
    Write up to `n_passages` digit-stripped poetry passages for one century.

    century: corpus tag used in both the input directory name
        (`<century>PoetryAll/`) and the output file prefix, e.g. '19C'.
    authors_sampled: set of author keys already drawn; updated in place so
        that the same author is never drawn twice, even across centuries.
    banner_prefix: text printed immediately before the progress banner.
    n_passages: number of passages to write before stopping.
    """
    src_dir = path+'historical-corpora/Poetry/'+century+'PoetryAll/'
    fnames = os.listdir(src_dir)
    print(banner_prefix+"Running for Poetry "+century+"... Total fnames:", len(fnames))
    random.shuffle(fnames)

    written = 0
    for fname in fnames:
        if fname.startswith('.'):
            continue  # hidden files (e.g. .DS_Store)

        # The author key is the second '_'-separated field of the file name.
        fields = fname.split('_')
        if len(fields) < 2:
            # Without this guard a name with no '_' would raise IndexError.
            print("Skipping (no author field in file name):", fname)
            continue
        author = fields[1]
        if author in authors_sampled:
            continue
        # Recorded before the passage is validated: each author gets exactly
        # one draw, and is not retried if that draw is rejected below.
        authors_sampled.add(author)

        with open(src_dir+fname, 'r') as FILE:
            txt = FILE.read()
        passages = utils.sample_5_sents(txt, N_SAMPLES=1)
        # Reject works that did not yield exactly one passage, or too long a one.
        if len(passages) != 1 or len(passages[0]) > 1000:
            continue
        text = passages[0]
        # Reject passages that open with a quotation mark.
        if text.startswith(("'", '"')):
            continue
        print(fname)
        clean_text = ''.join(ch for ch in text if not ch.isdigit())  # remove numbers
        # fname[:-4] drops the last four characters (the extension, e.g. '.txt').
        out_name = 'POETRY_5S_'+century+'_'+re.sub(r'\W+', '-', fname[:-4])+'.txt'
        with open(path+'hand-annotate/poetry/'+out_name, 'w') as F:
            F.write(clean_text)
        written += 1
        if written == n_passages:
            break


def poetry():
    """
    Sample 120 random passages from Poetry (60 from 19C and 60 from 20C).
    We remove all digits (line numbers). All 120 are from different authors:
    the set of sampled authors is shared between the two centuries.
    """
    AUTHORS_SAMPLED = set()
    _sample_poetry_century('19C', AUTHORS_SAMPLED)
    _sample_poetry_century('20C', AUTHORS_SAMPLED, banner_prefix="\n\n\n\n")
            
            
def nonfic():
    """
    Draw one passage per work from the Hathi non-fiction directory, in random
    order, until 150 passages have been written.
    The first 2000 characters of every work are skipped (skip_initial_chars)
    to avoid noisy passages; only passages of 200-1000 characters are kept.
    """
    per_work = 1  # samples per work
    wanted = 150

    src_dir = path+'historical-corpora/19C_Underwood_NonFiction0/'
    listing = os.listdir(src_dir)
    print("Running for Non-Fiction... Total fnames:", len(listing))
    random.shuffle(listing)

    saved = 0
    for name in listing:
        # Hidden files are not corpus works.
        if name.startswith('.'):
            continue

        with open(src_dir+name, 'r') as src:
            body = src.read()
        drawn = utils.sample_5_sents(body, per_work, skip_initial_chars=2000)

        # Keep the work only if exactly one passage of acceptable length came back.
        usable = len(drawn) == 1 and 200 <= len(drawn[0]) <= 1000
        if not usable:
            continue

        # name[:-4] drops the last four characters (the extension, e.g. '.txt').
        stem = re.sub(r'\W+', '-', name[:-4])
        with open(path+'hand-annotate/nonfic/19CNONFIC_5S_'+stem+'.txt', 'w') as dst:
            dst.write(drawn[0])

        saved += 1
        if saved == wanted:
            break
            

def science():
    """
    Sample 150 random passages from the Royal Society Corpus.
    See royal_corpus_txt.py for the conversion step that produces the input
    directory (presumably corpus -> .txt files; confirm against that script).

    One passage is drawn per work, in random file order; only passages of
    200-1000 characters are written out.
    """
    N_SAMPLES = 1 # samples per work

    p = path+'historical-corpora/royal-society-corpus-txts/'
    fnames = os.listdir(p)
    k = 0
    print("Running for Science... Total fnames:", len(fnames))
    random.shuffle(fnames)
    for fname in fnames:
        if fname.startswith('.'):
            continue  # hidden files (e.g. .DS_Store)

        with open(p+fname, 'r') as FILE:
            txt = FILE.read()
        random_5s = utils.sample_5_sents(txt, N_SAMPLES)
        # Reject works that did not yield exactly one passage of acceptable length.
        if len(random_5s) != 1 or len(random_5s[0]) > 1000 or len(random_5s[0]) < 200:
            continue
        text = random_5s[0]
        # NOTE(review): a truncated `if text.startswith(` statement stood here and
        # made the file unparseable; it was removed because the intended prefix
        # filter cannot be recovered. Re-add the filter if one is needed.
        # fname[:-4] drops the last four characters (the extension, e.g. '.txt').
        with open(path+'hand-annotate/science/SCIENCE_5S_'+re.sub(r'\W+', '-', fname[:-4])+'.txt', 'w') as F:
            F.write(text)
        k += 1
        if k == 150:
            break
            
# Entry points: uncomment the sampler(s) to run. The active call executes
# whenever this module is run or imported (there is no __main__ guard).
# poetry()
# nonfic()
science()